function Reward=OurPolicy1(N, M, P0, P0next, P0prev, P1, P1next, P1prev, H, gamma, epsilon,R, T)
%OURPOLICY1 Simulate a Whittle-index policy built on windowed optimistic estimates.
%
%   Reward = OurPolicy1(N, M, P0, P0next, P0prev, P1, P1next, P1prev, ...
%                       H, gamma, epsilon, R, T)
%
%   For each of T rounds the function (1) samples a window parameter from a
%   fixed grid with EXP3, (2) computes Whittle indices from the current
%   optimistic transition estimates via WhittleIndex1, (3) simulates H steps
%   in which the M arms with the largest indices are activated, and
%   (4) refreshes the estimates from transition counts inside the window.
%
%   Inputs
%     N        number of arms.
%     M        number of arms activated at every step.
%     P0       indexed as P0(i,t): probability that arm i keeps its state
%              in round t when passive; otherwise it moves one state down
%              (clamped at state 1).
%     P1       indexed as P1(i,t): probability that arm i keeps its state
%              in round t when active; otherwise it moves one state up
%              (clamped at state length(R)).
%     P0next, P0prev, P1next, P1prev
%              accepted for interface compatibility; not read here.
%     H        number of steps simulated per round.
%     gamma    discount factor applied per step within a round.
%     epsilon  ignored; the value is chosen internally by EXP3 each round.
%     R        reward per state; length(R) is the number of states.
%     T        number of rounds.
%
%   Output
%     Reward   1-by-T vector; Reward(t) is the discounted reward summed
%              over all arms and all H steps of round t.

%% reward initialization
Reward=zeros(1,T);

%% per-round transition counts (initialised to 1 so window totals are never 0)
% Only arms with index i > N/2 contribute to these counts.
C0=ones(1, T);       % passive and kept its state
C0prev=ones(1, T);   % passive and did not keep its state
C1=ones(1, T);       % active and kept its state
C1next=ones(1, T);   % active and did not keep its state

%% confidence bound parameter
eta=0.05;
% NOTE(review): base-10 logarithm kept from the original; Hoeffding-style
% radii are normally stated with the natural log -- confirm intent.
num=sqrt(2*log10(N*T/eta));

%% Bandit-over-bandit (EXP3) parameters
epsGrid=0.1:0.1:1;        % candidate values (named so builtin eps is not shadowed)
K=numel(epsGrid);
weight=ones(1, K);        % EXP3 weights
alpha=0.1;                % EXP3 exploration rate

%% Initial transition estimates (shared by all arms)
Q0prev=0.5;
Q0=1-Q0prev;
Q1next=0.5;
Q1=1-Q1next;

numStates=length(R);
Whittle=zeros(N, numStates);
W=zeros(1, N);

for t=1:T
    %% Window selection (EXP3)
    epsProb=(1-alpha)*weight/sum(weight)+alpha/K;
    pick=find(rand<cumsum(epsProb),1,'first');
    if isempty(pick)
        % Rounding can leave the last cumulative sum marginally below the
        % draw; fall back to the last candidate instead of an empty choice.
        pick=K;
    end
    epsSel=epsGrid(pick);
    win=ceil(1/epsSel);

    %% Whittle index calculation
    state=ones(1, N);     % every arm starts the round in state 1
    for i=1:N
        Whittle(i,:)=WhittleIndex1(R, Q0, 0, Q0prev, 0, Q1, Q1next, 0, 0,  gamma);
    end

    %% Steps within the round
    for h=1:H

        %% action selection: activate the M arms with the largest index
        for i=1:N
            W(i)=Whittle(i,state(i));
        end
        a=zeros(1, N);
        [~, index]=maxk(W, M);
        a(index)=1;

        %% state transitions
        for i=1:N
            s=state(i);
            randnumber=rand;
            counted=(i>N/2);
            if a(i)==1
                if randnumber<=P1(i, t)
                    if counted
                        C1(t)=C1(t)+1;
                    end
                else
                    if counted
                        C1next(t)=C1next(t)+1;
                    end
                    if s<numStates
                        state(i)=s+1;
                    end
                end
            else
                if randnumber<=P0(i, t)
                    if counted
                        C0(t)=C0(t)+1;
                    end
                else
                    if counted
                        C0prev(t)=C0prev(t)+1;
                    end
                    if s>1
                        state(i)=s-1;
                    end
                end
            end
        end

        %% reward of the post-transition states, discounted by step
        for i=1:N
            Reward(t)=Reward(t)+gamma^(h-1)*R(state(i));
        end
    end

    %% Optimistic estimates from counts in the window [lw, t]
    lw=max(t-win+1,1);

    % Active arms: be optimistic about moving up. The move-up frequency is
    % C1next/(C1+C1next); the original used the stay count C1 here, which
    % estimated the wrong transition.
    stayCnt=sum(C1(lw:t));
    nextCnt=sum(C1next(lw:t));
    delta=num/sqrt(stayCnt+nextCnt);
    Q1next=min(nextCnt/(stayCnt+nextCnt)+delta, 1);
    Q1=1-Q1next;

    % Passive arms: be optimistic about keeping the state; win*epsSel is an
    % additional bonus added on top of the confidence radius.
    stayCnt=sum(C0(lw:t));
    prevCnt=sum(C0prev(lw:t));
    delta=num/sqrt(stayCnt+prevCnt);
    Q0=min(stayCnt/(stayCnt+prevCnt)+delta+win*epsSel, 1);
    Q0prev=1-Q0;

    %% EXP3 weight update for the candidate that was played
    % NOTE(review): 9 is presumably the largest per-step reward used to
    % scale Reward(t) -- confirm against R.
    Rx=Reward(t)/(H*9*N);
    scale=(alpha*Rx)/(epsProb(pick)*K);
    weight(pick)=weight(pick)*exp(scale);
    % Rescale so the weights cannot overflow over long horizons; epsProb
    % depends only on weight/sum(weight), so the sampling law is unchanged.
    weight=weight/max(weight);

end

end

